
#############################
## 04_create_finaldata.R ####
#############################


# SETUP
# Penalize scientific notation (scipen = 10) so that large numeric values are
# printed and written in fixed notation rather than e.g. 1e+08
options(scipen=10)
# load packages; p_load() comes from pacman and attaches every package listed
library(pacman)
p_load(here, dplyr, ggplot2, stringr, modelsummary, cowplot, stargazer,
       bit64, gridExtra, estimatr, tidyr, kableExtra, MatchIt)
# declare this script's location relative to the project root, so that the
# here("Data", ...) paths used below resolve from that root
i_am("Code/04_create_finaldata.R")


# Load, recode, and export data
# NOTE(review): the code below uses `tab1` and `juror`; presumably these are
# the objects restored by the two load() calls -- confirm against the .RData
# files.
load(here("Data", "juror_level.RData"))
load(here("Data", "juror_mentions_byround.RData"))

# merge in jury racial composition (natural join on the shared columns)
tab1 <- left_join(tab1, juror %>% select(jurynum, whites) %>% distinct())

# Set preference-mention counts to NA (not 0) when the juror's own preference
# is one of the values commonly used in other ways, then recode the jury's
# racial composition into three categories.
# NOTE(review): fore_pref_mentions is masked using own_pref rather than a
# foreperson-preference column -- confirm that this is intended.
tab1 <- tab1 %>%
  mutate(
    own_pref_mentions = case_when(
      own_pref %in% c(200000, 100000000, 200000000) ~ NA_integer_,
      TRUE ~ own_pref_mentions
    ),
    fore_pref_mentions = case_when(
      own_pref %in% c(200000, 100000000, 200000000) ~ NA_integer_,
      TRUE ~ fore_pref_mentions
    ),
    whites_cat = case_when(
      whites == 6 ~ "6 whites",
      whites == 5 ~ "5 whites",
      whites <= 4 ~ "Less than 5 whites"
    )
  )

# count total mentions of preferences: the number of values stored in each
# row's `nums` entry; vapply() guarantees one integer per row, even when the
# table has no rows
tab1$total_pref_mentions <- vapply(
  tab1$nums,
  function(x) length(unlist(x)),
  integer(1)
)

# remove duplicate rows
tab1 <- tab1 %>% distinct()

# count others on jury who share preference
# For each row, compare the juror's own preference with every preference
# recorded for the same jury and round. The comparison propagates NA, so the
# count is NA whenever the juror's own preference, or any preference in that
# jury-round, is missing.
tab1$num_others_pref <- NA
for (i in seq_len(nrow(tab1))) {
  same_jury_round <- tab1$jurynum == tab1$jurynum[i] &
    tab1$round == tab1$round[i]
  j_pref <- tab1$own_pref[same_jury_round]
  # subtract 1 so the juror's own row is not counted as a match
  tab1$num_others_pref[i] <- sum(j_pref == tab1$own_pref[i]) - 1
}

# Per jury and round:
#   total_words_attr_round - sum of `length` over the rows of that jury-round
#   fore                   - 1 if the Identifier contains "fore" (any case)
#   min_turn               - smallest non-missing first_turn in the jury-round
# first_turn is then shifted so the earliest attributed turn in each
# jury-round becomes turn 1.
tab1 <- tab1 %>%
  group_by(jurynum, round) %>%
  mutate(
    total_words_attr_round = sum(length),
    fore = as.numeric(grepl("fore", tolower(Identifier))),
    min_turn = min(first_turn, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(first_turn = first_turn - min_turn + 1)

# Build the jury-level table: attach to every juror row the jury's median
# dollar and scale preference (ignoring NAs) and a space-separated string of
# all jurors' preferences, then keep only jury-level columns and collapse to
# the distinct rows.
jury <- juror %>%
  group_by(jurynum) %>%
  mutate(
    median_doll = median(idoll, na.rm = TRUE),
    median_scale = median(iscale, na.rm = TRUE),
    doll_prefs = paste(idoll, collapse = " "),
    scale_prefs = paste(iscale, collapse = " ")
  ) %>%
  ungroup() %>%
  select(
    jurynum, whites, nonwhites, jrydoll, jryscale, order, scenario,
    scale_mentions:total_doll_mentions,
    median_doll:scale_prefs
  ) %>%
  distinct()

# Reshape jury-level data so 1 row = 1 jury, 1 deliberation round.
# The two *_length columns are stacked; `scale` records which measure the row
# refers to ("scale" or "doll") and drives which measure-specific column each
# generic column is taken from.
jury_byround <- jury %>%
  pivot_longer(
    c(scale_length, doll_length),
    names_to = "scale",
    values_to = "length"
  ) %>%
  mutate(
    scale = str_remove(scale, "_length"),
    mentions = case_when(scale == "scale" ~ scale_mentions,
                         scale == "doll" ~ doll_mentions),
    mentions_loc = case_when(scale == "scale" ~ scale_mentions_loc,
                             scale == "doll" ~ doll_mentions_loc),
    total_mentions = case_when(scale == "scale" ~ total_scale_mentions,
                               scale == "doll" ~ total_doll_mentions),
    med_pref = case_when(scale == "scale" ~ median_scale,
                         scale == "doll" ~ median_doll),
    prefs = case_when(scale == "scale" ~ scale_prefs,
                      scale == "doll" ~ doll_prefs)
  ) %>%
  # drop the measure-specific source columns
  # NOTE(review): the range stops at doll_prefs; if scale_prefs sits after
  # doll_prefs it is not dropped here -- confirm whether that is intended
  select(-c(scale_mentions:doll_prefs)) %>%
  # recode jury-level composition and round variables; the round number
  # depends on the presentation order (A: dollars first, B: scale first)
  mutate(
    whites_cat = case_when(whites == 6 ~ "6 whites",
                           whites == 5 ~ "5 whites",
                           whites <= 4 ~ "Less than 5 whites"),
    round = case_when(order == "A" & scale == "doll" ~ 1,
                      order == "A" & scale == "scale" ~ 2,
                      order == "B" & scale == "scale" ~ 1,
                      order == "B" & scale == "doll" ~ 2)
  )

# Reshape juror-level data so 1 row = 1 juror, 1 deliberation round.
# scale_mentions / doll_mentions are stacked into `mentions`, and every
# measure-specific juror variable is collapsed into one generic column chosen
# by `scale` ("scale" or "doll").
tab2 <- juror %>%
  pivot_longer(
    cols = c(scale_mentions, doll_mentions),
    names_to = "scale",
    values_to = "mentions"
  ) %>%
  mutate(
    scale = recode(scale, "scale_mentions" = "scale", "doll_mentions" = "doll"),
    total_mentions = case_when(scale == "scale" ~ total_scale_mentions,
                               scale == "doll" ~ total_doll_mentions),
    pref_mentions = case_when(scale == "scale" ~ num_scale_mentions,
                              scale == "doll" ~ num_doll_mentions),
    # round number depends on presentation order (A: dollars first)
    round = case_when(order == "A" & scale == "doll" ~ 1,
                      order == "A" & scale == "scale" ~ 2,
                      order == "B" & scale == "scale" ~ 1,
                      order == "B" & scale == "doll" ~ 2),
    own_pref = case_when(scale == "doll" ~ as.integer(idoll),
                         scale == "scale" ~ as.integer(iscale)),
    num_others_pref = case_when(scale == "doll" ~ num_others_doll,
                                scale == "scale" ~ num_others_scale),
    dissenter = case_when(scale == "doll" ~ dissenter_doll,
                          scale == "scale" ~ dissenter_scale),
    unique = case_when(scale == "doll" ~ unique_doll,
                       scale == "scale" ~ unique_scale),
    # ordinal version of the preference: or_idoll for dollars, iscale as-is
    ordinal_pref = case_when(scale == "doll" ~ as.integer(or_idoll),
                             scale == "scale" ~ as.integer(iscale)),
    first_pref_prefs = case_when(scale == "scale" ~ first_scale_prefs,
                                 scale == "doll" ~ first_doll_prefs),
    first_pref_words = case_when(scale == "scale" ~ first_scale_words,
                                 scale == "doll" ~ first_doll_words),
    last_pref_prefs = case_when(scale == "scale" ~ last_scale_prefs,
                                scale == "doll" ~ last_doll_prefs),
    last_pref_words = case_when(scale == "scale" ~ last_scale_words,
                                scale == "doll" ~ last_doll_words)
  ) %>%
  # median ordinal preference within each jury-round (NA if any is missing)
  group_by(jurynum, round) %>%
  mutate(predelib_median = median(ordinal_pref)) %>%
  ungroup() %>%
  mutate(
    whites_cat = case_when(whites == 6 ~ "6 whites",
                           whites == 5 ~ "5 whites",
                           whites <= 4 ~ "Less than 5 whites")
  ) %>%
  # drop measure-specific columns that now live in the generic columns
  select(
    -total_scale_mentions, -total_doll_mentions,
    -num_scale_mentions, -num_doll_mentions,
    -idoll, -iscale, -num_others_doll, -num_others_scale
  )

# merge in juror-level variables; tab2's round is converted to character
# before joining (all joins below are natural joins on the shared columns)
tab1 <- tab1 %>%
  left_join(
    tab2 %>%
      select(case_id, round, unique, ordinal_pref) %>%
      mutate(round = as.character(round))
  )

# load data on all jurors, not just identified
allj <- read.csv(here("Data", "JuryDataSummer2004.csv"))
# merge in composition data on other variables
tab2 <- tab2 %>%
  left_join(allj %>% select(case_id, income, age, sex, educ, ethnic))
tab1 <- tab1 %>%
  left_join(allj %>% select(case_id, income, age, sex, educ, ethnic))

# merge in the jury-round pre-deliberation median
tab1 <- tab1 %>%
  left_join(
    tab2 %>%
      select(jurynum, predelib_median, round) %>%
      mutate(round = as.character(round)) %>%
      distinct()
  )
# calculate each row's share of the words attributed in its jury-round
tab1 <- tab1 %>%
  mutate(length_prop = length / total_words_attr_round)

# calculate jury-level compositions
# First count, per jury, the jurors in each demographic category, then turn
# each count into a set of indicator columns; only the indicators are kept.
# Counts built with `==` or sum(white) are NA when any juror's value is
# missing, whereas counts built with %in% treat a missing value as no match.
jcounts <- allj %>%
  group_by(jurynum) %>%
  mutate(
    totyoung = sum(age == 1),
    totfemale = sum(sex == 2),
    totwhite = sum(white),
    toths = sum(educ %in% c(1, 2)),
    totinclow = sum(income %in% c(1, 2, 3)),
    female12 = totfemale %in% c(1, 2),
    female3 = totfemale == 3,
    female4 = totfemale == 4,
    female56 = totfemale %in% c(5, 6),
    white6 = totwhite == 6,
    white5 = totwhite == 5,
    white24 = totwhite < 5,
    young0 = totyoung == 0,
    young1 = totyoung == 1,
    young2 = totyoung == 2,
    young34 = totyoung %in% c(3, 4),
    hs0 = toths == 0,
    hs1 = toths == 1,
    hs2 = toths == 2,
    hs35 = toths %in% c(3, 4, 5),
    lowinc0 = totinclow == 0,
    lowinc1 = totinclow == 1,
    lowinc2 = totinclow == 2,
    lowinc3 = totinclow == 3,
    lowinc45 = totinclow %in% c(4, 5)
  ) %>%
  select(jurynum, female12:lowinc45) %>%
  distinct()

# merge counts into jury-level data, then add the jury-level preference
# variation variable (sd of ordinal preferences)
# NOTE(review): sd_pre is grouped by jurynum only, so it pools the rows of
# both rounds, and it is NA when any ordinal_pref in the jury is missing --
# confirm that this is intended
jury_byround <- jury_byround %>%
  left_join(jcounts) %>%
  left_join(
    tab2 %>%
      group_by(jurynum) %>%
      summarize(sd_pre = sd(ordinal_pref))
  )

# recode demographic variables
# Replaces the numeric educ, age, income and sex codes with text labels.
# The same recode is needed for both tables, so it lives in one helper to keep
# the two from drifting apart. Values that match no condition (including NA)
# become NA.
#   df: a data frame with numeric columns educ, age, income and sex
#   returns: df with those four columns replaced by character labels
recode_demographics <- function(df) {
  df %>%
    mutate(
      educ = case_when(educ < 3 ~ "HS or less",
                       educ == 3 ~ "Some coll.",
                       educ > 3 ~ "BA or more"),
      age = case_when(age < 3 ~ "18-39",
                      age %in% c(3, 4) ~ "40-59",
                      age > 4 ~ "60+"),
      income = case_when(income <= 3 ~ "30k or less",
                         income == 4 ~ "30-50k",
                         income > 4 ~ "more than 50k"),
      sex = case_when(sex == 1 ~ "M",
                      sex == 2 ~ "F")
    )
}
tab2 <- recode_demographics(tab2)
tab1 <- recode_demographics(tab1)
# code pre-deliberation mean preference within each jury-round, copy the
# measure label into `type`, and calculate juror distance from the median
# (tab2 stays grouped by jurynum and round afterwards: there is no ungroup())
tab2 <- tab2 %>%
  group_by(jurynum, round) %>%
  mutate(
    predelib_mean = mean(ordinal_pref),
    type = scale,
    distance = abs(predelib_median - ordinal_pref)
  )

# recode tab1's type variable into the same "scale"/"doll" labels, merge in
# the additional tab2 variables, calculate juror distance from the median, and
# make "Less than 5 whites" the reference race category
tab1 <- tab1 %>%
  mutate(scale = case_when(type == "Scale" ~ "scale",
                           type == "Dollars" ~ "doll")) %>%
  left_join(
    tab2 %>%
      select(case_id, round, ordinal_pref, predelib_mean) %>%
      mutate(round = as.character(round))
  ) %>%
  mutate(
    distance = abs(predelib_median - ordinal_pref),
    whites_cat = relevel(factor(whites_cat), ref = "Less than 5 whites")
  )

# recode race categories for tab2; done on the whole column rather than in a
# mutate() because tab2 is grouped and the factor must be built once
tab2[["whites_cat"]] <- relevel(factor(tab2[["whites_cat"]]),
                                ref = "Less than 5 whites")


# find jurors with prefs close to the focal juror's on the ordinal scale
# NOTE(review): the original comments say "within 2 ordinal scale points", but
# the code keeps preferences whose ordinal distance is <= 1 -- confirm which
# threshold is intended. The threshold is left unchanged here.
tab2$close_prefs <- NA
tab2$num_others_close <- NA

# take others on jury and find those within the threshold, excluding NAs;
# rows whose own preference is missing keep NA in both columns
for (i in seq_len(nrow(tab2))) {
  if (is.na(tab2$own_pref[i])) {
    next
  }
  same_jury_round <- tab2$jurynum == tab2$jurynum[i] &
    tab2$round == tab2$round[i]
  others <- tab2$own_pref[same_jury_round]
  others_diff <- tab2$ordinal_pref[i] - tab2$ordinal_pref[same_jury_round]
  # distinct preference values that are close to, but different from, the
  # focal juror's own preference
  closest <- unique(others[abs(others_diff) <= 1 &
                             others != tab2$own_pref[i] &
                             !is.na(others)])
  if (length(closest) == 0) {
    tab2$num_others_close[i] <- 0
  } else {
    tab2$close_prefs[[i]] <- list(closest)
    # number of jurors in the jury-round holding one of those values
    tab2$num_others_close[i] <- sum(others %in% closest)
  }
}

# merge the closeness variables into tab1 (tab2's round is converted to
# character for the join)
tab1 <- tab1 %>%
  left_join(
    tab2 %>%
      select(case_id, round, close_prefs, num_others_close) %>%
      mutate(round = as.character(round))
  )
# convert tab2's round to character for good and pull in the share of words
# attributed from tab1
tab2 <- tab2 %>%
  mutate(round = as.character(round)) %>%
  left_join(tab1 %>% select(case_id, round, jury_prop_words_attributed))

# find jury mentions of white and nonwhite prefs
# For every jury and round, count how many of the mentioned values match a
# preference held by a white juror and, separately, by a nonwhite juror. The
# mentions are read from the first row of the jury-round, and the two counts
# are written to every row of that jury-round.
tab2$num_white_mentions <- NA
tab2$num_nonwhite_mentions <- NA
jury_ids <- unique(tab2$jurynum)
for (i in seq_along(jury_ids)) {
  jnum <- jury_ids[i]
  for (k in 1:2) {
    in_round <- tab2$jurynum == jnum & tab2$round == k
    white_prefs <- tab2$own_pref[in_round & tab2$white == 1]
    nonwhite_prefs <- tab2$own_pref[in_round & tab2$white == 0]
    round_mentions <- unlist(tab2$mentions[in_round][1])
    tab2$num_white_mentions[in_round] <- sum(round_mentions %in% white_prefs)
    tab2$num_nonwhite_mentions[in_round] <-
      sum(round_mentions %in% nonwhite_prefs)
  }
}


# find mentions of individual prefs by white and nonwhite jurors
# For each row, look at the values mentioned (`nums`) by the other jurors in
# the same jury and round, split by the mentioning juror's race, and record
# how many equal the focal juror's own preference and how many there are in
# total. The result columns are created up front so the loop never assigns
# into a column that does not exist yet.
tab1$white_mens <- NA_integer_
tab1$total_white_mens <- NA_integer_
tab1$poc_mens <- NA_integer_
tab1$total_poc_mens <- NA_integer_
for (i in seq_len(nrow(tab1))) {
  # same jury, same round, not the focal juror
  others_same_round <- tab1$jurynum == tab1$jurynum[i] &
    tab1$round == tab1$round[i] &
    tab1$case_id != tab1$case_id[i]
  # mentions by white jurors
  white_mens <- unlist(tab1$nums[others_same_round & tab1$white == 1])
  tab1$white_mens[i] <- sum(white_mens == tab1$own_pref[i], na.rm = TRUE)
  tab1$total_white_mens[i] <- length(white_mens)
  # mentions by POC jurors
  poc_mens <- unlist(tab1$nums[others_same_round & tab1$white == 0])
  tab1$poc_mens[i] <- sum(poc_mens == tab1$own_pref[i], na.rm = TRUE)
  tab1$total_poc_mens[i] <- length(poc_mens)
}

# find mentions of preferences far from the focal juror's
# "Far" means more than 3 points away on the ordinal preference scale, within
# the same jury and round. num_others_far counts those jurors; far_mentions
# counts how often their preference values occur in row i's `mentions` and
# stays NA when there are no far jurors.
tab2$far_mentions <- NA
tab2$num_others_far <- NA
for (i in seq_len(nrow(tab2))) {
  others_far <- which(
    tab2$jurynum == tab2$jurynum[i] &
      tab2$round == tab2$round[i] &
      abs(tab2$ordinal_pref - tab2$ordinal_pref[i]) > 3
  )
  tab2$num_others_far[i] <- length(others_far)
  if (length(others_far) > 0) {
    others_prefs <- unique(tab2$own_pref[others_far])
    # pad every mention and every preference with spaces so that a preference
    # only matches a whole mentioned number, not part of a longer one
    mention_text <- paste0(
      " ", as.character(unlist(tab2$mentions[[i]])), " ",
      collapse = " "
    )
    pref_patterns <- paste0(" ", as.character(as.integer(others_prefs)), " ")
    tab2$far_mentions[i] <- sum(str_count(mention_text, pref_patterns))
  }
}


# write out datasets for analysis
#   list columns are dropped before writing
#   verbatim text is removed to protect individual jurors
tab1 %>%
  select(-c(nums, nums_loc, prefs, close_prefs, text)) %>%
  write.csv(here("Data", "tab1.csv"))
tab2 %>%
  select(-c(scale_mentions_loc, doll_mentions_loc, mentions, close_prefs)) %>%
  write.csv(here("Data", "tab2.csv"))

